import re


# Load the saved page and drop every space, tab and newline so the regexes
# below can match tags without caring how the markup was laid out.
# NOTE(review): this also removes spaces inside the captured link text --
# confirm that is acceptable for the names written out later.
with open('./sup/cx.html', 'r', encoding='utf8') as f:
    cx = f.read()
cx = cx.translate(str.maketrans('', '', ' \t\n'))

# Because the whitespace is gone, "<span class=" has become "<spanclass=";
# the ".?" keeps the patterns tolerant of one leftover separator character.
span_pattern = '<span.?class="pl5pr.*?">(.*?)</span>'
link_pattern = r'<a.?class=".*?".*?href="\?(.*?)".*?>(.*?)</a>'

# ke_id: text of each <span> whose class attribute starts with "pl5pr".
ke_id = re.findall(span_pattern, cx)
# ke_url_name: (query string after "?", link text) for each matching <a>.
ke_url_name = re.findall(link_pattern, cx)

# Write one tab-separated row per scraped link: full URL, id, link text.
with open('./sup/listURL.txt', 'w', encoding='utf8') as f:
    src = 'http://mooc1.chaoxing.com/nodedetailcontroller/visitnodedetail?'
    # Pair each link with the id found at the same position. Looking the
    # position up with list.index() would return the first occurrence, so a
    # repeated (query, name) pair would get the wrong id, and it rescans the
    # list for every row.
    # An IndexError here means fewer ids than links were scraped.
    for pos, (query, name) in enumerate(ke_url_name):
        # Skip links whose text starts with '下一页' ("next page").
        if name.startswith('下一页'):
            continue
        f.write(src + query + '\t' + ke_id[pos] + '\t' + name + '\n')

# Debug summary: for the id list and then the (query, name) list, print the
# number of matches followed by the matches themselves.
for matches in (ke_id, ke_url_name):
    print(len(matches), matches)


# print(cx)
# ke_url = re.findall(r'<a class=".*?".*?href="\?(.*?)"', cx)
# ke_name = re.findall(r'                                               	                                                    (.*?)\n', cx)
# print(len(ke_url), ke_url)
# print(len(ke_name), ke_name)
#
# with open('./sup/listURL.txt', 'w', encoding='utf8') as l:
#     for i in range(len(ke_name)):
#         s = ''
#         if i <= 46: s = s + ke_name[i]
#         s += '\t'
#         if i <= 47: s = s + 'http://mooc1.chaoxing.com/nodedetailcontroller/visitnodedetail?'+ke_url[i]
#         s += '\n'
#         l.write(s)
